Code
from IPython.display import HTML
from plotly.io import to_htmlVisual Reporting and Storytelling
By the end of this lab, you will: 1. Load and analyze the Lightcast dataset in Spark DataFrame. 2. Create five easy and three medium-complexity visualizations using Plotly. 3. Explore salary distributions, employment trends, and job postings. 4. Analyze skills in relation to NAICS/SOC/ONET codes and salaries. 5. Customize colors, fonts, and styles in all visualizations (default themes result in a 2.5-point deduction). 6. Follow best practices for reporting on data communication.
from IPython.display import HTML
from plotly.io import to_htmlimport pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()
# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./lightcast_job_postings.csv")
# Show Schema and Sample Data
df.printSchema()
df.show(5)
root
|-- ID: string (nullable = true)
|-- LAST_UPDATED_DATE: string (nullable = true)
|-- LAST_UPDATED_TIMESTAMP: timestamp (nullable = true)
|-- DUPLICATES: integer (nullable = true)
|-- POSTED: string (nullable = true)
|-- EXPIRED: string (nullable = true)
|-- DURATION: integer (nullable = true)
|-- SOURCE_TYPES: string (nullable = true)
|-- SOURCES: string (nullable = true)
|-- URL: string (nullable = true)
|-- ACTIVE_URLS: string (nullable = true)
|-- ACTIVE_SOURCES_INFO: string (nullable = true)
|-- TITLE_RAW: string (nullable = true)
|-- BODY: string (nullable = true)
|-- MODELED_EXPIRED: string (nullable = true)
|-- MODELED_DURATION: integer (nullable = true)
|-- COMPANY: integer (nullable = true)
|-- COMPANY_NAME: string (nullable = true)
|-- COMPANY_RAW: string (nullable = true)
|-- COMPANY_IS_STAFFING: boolean (nullable = true)
|-- EDUCATION_LEVELS: string (nullable = true)
|-- EDUCATION_LEVELS_NAME: string (nullable = true)
|-- MIN_EDULEVELS: integer (nullable = true)
|-- MIN_EDULEVELS_NAME: string (nullable = true)
|-- MAX_EDULEVELS: integer (nullable = true)
|-- MAX_EDULEVELS_NAME: string (nullable = true)
|-- EMPLOYMENT_TYPE: integer (nullable = true)
|-- EMPLOYMENT_TYPE_NAME: string (nullable = true)
|-- MIN_YEARS_EXPERIENCE: integer (nullable = true)
|-- MAX_YEARS_EXPERIENCE: integer (nullable = true)
|-- IS_INTERNSHIP: boolean (nullable = true)
|-- SALARY: integer (nullable = true)
|-- REMOTE_TYPE: integer (nullable = true)
|-- REMOTE_TYPE_NAME: string (nullable = true)
|-- ORIGINAL_PAY_PERIOD: string (nullable = true)
|-- SALARY_TO: integer (nullable = true)
|-- SALARY_FROM: integer (nullable = true)
|-- LOCATION: string (nullable = true)
|-- CITY: string (nullable = true)
|-- CITY_NAME: string (nullable = true)
|-- COUNTY: integer (nullable = true)
|-- COUNTY_NAME: string (nullable = true)
|-- MSA: integer (nullable = true)
|-- MSA_NAME: string (nullable = true)
|-- STATE: integer (nullable = true)
|-- STATE_NAME: string (nullable = true)
|-- COUNTY_OUTGOING: integer (nullable = true)
|-- COUNTY_NAME_OUTGOING: string (nullable = true)
|-- COUNTY_INCOMING: integer (nullable = true)
|-- COUNTY_NAME_INCOMING: string (nullable = true)
|-- MSA_OUTGOING: integer (nullable = true)
|-- MSA_NAME_OUTGOING: string (nullable = true)
|-- MSA_INCOMING: integer (nullable = true)
|-- MSA_NAME_INCOMING: string (nullable = true)
|-- NAICS2: integer (nullable = true)
|-- NAICS2_NAME: string (nullable = true)
|-- NAICS3: integer (nullable = true)
|-- NAICS3_NAME: string (nullable = true)
|-- NAICS4: integer (nullable = true)
|-- NAICS4_NAME: string (nullable = true)
|-- NAICS5: integer (nullable = true)
|-- NAICS5_NAME: string (nullable = true)
|-- NAICS6: integer (nullable = true)
|-- NAICS6_NAME: string (nullable = true)
|-- TITLE: string (nullable = true)
|-- TITLE_NAME: string (nullable = true)
|-- TITLE_CLEAN: string (nullable = true)
|-- SKILLS: string (nullable = true)
|-- SKILLS_NAME: string (nullable = true)
|-- SPECIALIZED_SKILLS: string (nullable = true)
|-- SPECIALIZED_SKILLS_NAME: string (nullable = true)
|-- CERTIFICATIONS: string (nullable = true)
|-- CERTIFICATIONS_NAME: string (nullable = true)
|-- COMMON_SKILLS: string (nullable = true)
|-- COMMON_SKILLS_NAME: string (nullable = true)
|-- SOFTWARE_SKILLS: string (nullable = true)
|-- SOFTWARE_SKILLS_NAME: string (nullable = true)
|-- ONET: string (nullable = true)
|-- ONET_NAME: string (nullable = true)
|-- ONET_2019: string (nullable = true)
|-- ONET_2019_NAME: string (nullable = true)
|-- CIP6: string (nullable = true)
|-- CIP6_NAME: string (nullable = true)
|-- CIP4: string (nullable = true)
|-- CIP4_NAME: string (nullable = true)
|-- CIP2: string (nullable = true)
|-- CIP2_NAME: string (nullable = true)
|-- SOC_2021_2: string (nullable = true)
|-- SOC_2021_2_NAME: string (nullable = true)
|-- SOC_2021_3: string (nullable = true)
|-- SOC_2021_3_NAME: string (nullable = true)
|-- SOC_2021_4: string (nullable = true)
|-- SOC_2021_4_NAME: string (nullable = true)
|-- SOC_2021_5: string (nullable = true)
|-- SOC_2021_5_NAME: string (nullable = true)
|-- LOT_CAREER_AREA: integer (nullable = true)
|-- LOT_CAREER_AREA_NAME: string (nullable = true)
|-- LOT_OCCUPATION: integer (nullable = true)
|-- LOT_OCCUPATION_NAME: string (nullable = true)
|-- LOT_SPECIALIZED_OCCUPATION: integer (nullable = true)
|-- LOT_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
|-- LOT_OCCUPATION_GROUP: integer (nullable = true)
|-- LOT_OCCUPATION_GROUP_NAME: string (nullable = true)
|-- LOT_V6_SPECIALIZED_OCCUPATION: integer (nullable = true)
|-- LOT_V6_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
|-- LOT_V6_OCCUPATION: integer (nullable = true)
|-- LOT_V6_OCCUPATION_NAME: string (nullable = true)
|-- LOT_V6_OCCUPATION_GROUP: integer (nullable = true)
|-- LOT_V6_OCCUPATION_GROUP_NAME: string (nullable = true)
|-- LOT_V6_CAREER_AREA: integer (nullable = true)
|-- LOT_V6_CAREER_AREA_NAME: string (nullable = true)
|-- SOC_2: string (nullable = true)
|-- SOC_2_NAME: string (nullable = true)
|-- SOC_3: string (nullable = true)
|-- SOC_3_NAME: string (nullable = true)
|-- SOC_4: string (nullable = true)
|-- SOC_4_NAME: string (nullable = true)
|-- SOC_5: string (nullable = true)
|-- SOC_5_NAME: string (nullable = true)
|-- LIGHTCAST_SECTORS: string (nullable = true)
|-- LIGHTCAST_SECTORS_NAME: string (nullable = true)
|-- NAICS_2022_2: integer (nullable = true)
|-- NAICS_2022_2_NAME: string (nullable = true)
|-- NAICS_2022_3: integer (nullable = true)
|-- NAICS_2022_3_NAME: string (nullable = true)
|-- NAICS_2022_4: integer (nullable = true)
|-- NAICS_2022_4_NAME: string (nullable = true)
|-- NAICS_2022_5: integer (nullable = true)
|-- NAICS_2022_5_NAME: string (nullable = true)
|-- NAICS_2022_6: integer (nullable = true)
|-- NAICS_2022_6_NAME: string (nullable = true)
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
| ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES| POSTED| EXPIRED|DURATION| SOURCE_TYPES| SOURCES| URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO| TITLE_RAW| BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY| COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM| LOCATION| CITY| CITY_NAME|COUNTY| COUNTY_NAME| MSA| MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING| MSA_NAME_OUTGOING|MSA_INCOMING| MSA_NAME_INCOMING|NAICS2| NAICS2_NAME|NAICS3| NAICS3_NAME|NAICS4| NAICS4_NAME|NAICS5| NAICS5_NAME|NAICS6| NAICS6_NAME| TITLE| TITLE_NAME| TITLE_CLEAN| SKILLS| SKILLS_NAME| SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME| CERTIFICATIONS| CERTIFICATIONS_NAME| COMMON_SKILLS| COMMON_SKILLS_NAME| SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME| ONET| ONET_NAME| ONET_2019| ONET_2019_NAME| CIP6| CIP6_NAME| CIP4| CIP4_NAME| CIP2| CIP2_NAME|SOC_2021_2| SOC_2021_2_NAME|SOC_2021_3| SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME| SOC_2| SOC_2_NAME| SOC_3| SOC_3_NAME| SOC_4| SOC_4_NAME| SOC_5| SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2| NAICS_2022_2_NAME|NAICS_2022_3| NAICS_2022_3_NAME|NAICS_2022_4| NAICS_2022_4_NAME|NAICS_2022_5| NAICS_2022_5_NAME|NAICS_2022_6| NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024| 6/8/2024| 6| [\n "Company"\n]|[\n "brassring.c...|[\n "https://sjo...| []| NULL|Enterprise Analys...|31-May-2024\n\nEn...| 6/8/2024| 6| 894731| Murphy USA| Murphy USA| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 2| 2| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR| 5139| Union, AR|20980| El Dorado, AR| 5| Arkansas| 5139| Union, AR| 5139| Union, AR| 20980| El Dorado, AR| 20980| El Dorado, AR| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n "KS126DB6T06...|[\n "Merchandisi...|[\n "KS126DB6T06...| [\n "Merchandisi...| []| []|[\n "KS126706DPF...|[\n "Mathematics...|[\n "KS440W865GC...|[\n "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "45.0601",\n...|[\n "Economics, ...|[\n "45.06",\n ...|[\n "Economics",...|[\n "45",\n "27...|[\n "Social Scie...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 7\n]| [\n "Artificial ...| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...| 441330|Automotive Parts ...|
|0cb072af26757b6c4...| 8/2/2024| 2024-08-02 17:08:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Job Board"\n]| [\n "maine.gov"\n]|[\n "https://job...| []| NULL|Oracle Consultant...|Oracle Consultant...| 8/1/2024| NULL| 133098|Smx Corporation L...| SMX| true| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3| 3| false| NULL| 1| Remote| NULL| NULL| NULL|{\n "lat": 44.31...| QXVndXN0YSwgTUU=| Augusta, ME| 23011| Kennebec, ME|12300|Augusta-Watervill...| 23| Maine| 23011| Kennebec, ME| 23011| Kennebec, ME| 12300|Augusta-Watervill...| 12300|Augusta-Watervill...| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n "KS122626T55...|[\n "Procurement...|[\n "KS122626T55...| [\n "Procurement...| []| []| []| []|[\n "BGSBF3F508F...|[\n "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...| 561320|Temporary Help Se...|
|85318b12b3331fa49...| 9/6/2024| 2024-09-06 20:32:...| 1|6/2/2024| 7/7/2024| 35| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| NULL| Data Analyst|Taking care of pe...| 6/10/2024| 8|39063746| Sedgwick| Sedgwick| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 5| NULL| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 32.77...| RGFsbGFzLCBUWA==| Dallas, TX| 48113| Dallas, TX|19100|Dallas-Fort Worth...| 48| Texas| 48113| Dallas, TX| 48113| Dallas, TX| 19100|Dallas-Fort Worth...| 19100|Dallas-Fort Worth...| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...|524291| Claims Adjusting|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS1218W78FG...|[\n "Management"...|[\n "ESF3939CE1F...| [\n "Exception R...|[\n "KS683TN76T7...|[\n "Security Cl...|[\n "KS1218W78FG...|[\n "Management"...|[\n "KS126HY6YLT...|[\n "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...| 524291| Claims Adjusting|
|1b5c3941e54a1889e...| 9/6/2024| 2024-09-06 20:32:...| 1|6/2/2024|7/20/2024| 48| [\n "Job Board"\n]|[\n "disabledper...|[\n "https://www...| []| NULL|Sr. Lead Data Mgm...|About this role:\...| 6/12/2024| 10|37615159| Wells Fargo|Wells Fargo| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3| NULL| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 33.44...| UGhvZW5peCwgQVo=| Phoenix, AZ| 4013| Maricopa, AZ|38060|Phoenix-Mesa-Chan...| 4| Arizona| 4013| Maricopa, AZ| 4013| Maricopa, AZ| 38060|Phoenix-Mesa-Chan...| 38060|Phoenix-Mesa-Chan...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking|522110| Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n "KS123QX62QY...|[\n "Exit Strate...|[\n "KS123QX62QY...| [\n "Exit Strate...| []| []|[\n "KS7G6NP6R6L...|[\n "Reliability...|[\n "KS4409D76NW...|[\n "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 6\n]| [\n "Data Privac...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking| 522110| Commercial Banking|
|cb5ca25f02bdf25c1...| 6/19/2024| 2024-06-19 07:00:00| 0|6/2/2024|6/17/2024| 15|[\n "FreeJobBoar...|[\n "craigslist....|[\n "https://mod...| []| NULL|Comisiones de $10...|Comisiones de $10...| 6/17/2024| 15| 0| Unclassified| LH/GM| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 3|Part-time / full-...| NULL| NULL| false| 92500| 0| [None]| year| 150000| 35000|{\n "lat": 37.63...| TW9kZXN0bywgQ0E=| Modesto, CA| 6099|Stanislaus, CA|33700| Modesto, CA| 6|California| 6099| Stanislaus, CA| 6099| Stanislaus, CA| 33700| Modesto, CA| 33700| Modesto, CA| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000| Unclassified|comisiones de por...| []| []| []| []| []| []| []| []| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...| 999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows
EMPLOYMENT_TYPE_NAMESALARY_FROMimport plotly.express as px
# Explicitly filter dataset: remove null or zero salaries
filtered_df = df.filter((df["SALARY"].isNotNull()) & (df["SALARY"] > 0))
# Explicitly collect data directly to pandas DataFrame (bypassing distutils issue)
pdf = pd.DataFrame(
filtered_df.select("EMPLOYMENT_TYPE_NAME", "SALARY").collect(),
columns=["EMPLOYMENT_TYPE_NAME", "SALARY"]
)
# Create clear box plot visualizing salary distribution by employment type
fig = px.box(
pdf,
x="EMPLOYMENT_TYPE_NAME",
y="SALARY",
title="Salary Distribution by Employment Type",
color="EMPLOYMENT_TYPE_NAME",
color_discrete_sequence=px.colors.qualitative.Set2
)
# Explicitly enhance visualization readability
fig.update_layout(
font_family="Arial",
title_font_size=18,
xaxis_title="Employment Type",
yaxis_title="Salary",
legend_title="Employment Type",
boxmode="group"
)
HTML(to_html(fig, include_plotlyjs='cdn'))
The box plot clearly illustrates that full-time positions (>32 hours) generally offer higher median salaries compared to part-time positions. Additionally, part-time/full-time roles exhibit greater salary variability and have more extreme high-value outliers compared to purely part-time roles.
NAICS2_NAMESALARY_FROM# Filter dataset explicitly (salary > 0)
filtered_df = df.filter(df["SALARY_FROM"] > 0)
# Aggregate explicitly: select NAICS industry and salary columns, and convert to pandas dataframe
pdf = pd.DataFrame(
filtered_df.select("NAICS2_NAME", "SALARY_FROM").collect(),
columns=["NAICS2_NAME", "SALARY_FROM"]
)
# Create clear box plot comparing salary variations across industries
fig = px.box(
pdf,
x="NAICS2_NAME",
y="SALARY_FROM",
title="Salary Variations Across Industries",
color="NAICS2_NAME",
color_discrete_sequence=px.colors.qualitative.Pastel
)
# Explicitly customize graph styles for clarity and aesthetics
fig.update_layout(
font_family="Arial",
title_font_size=18,
xaxis_title="Industry (NAICS2)",
yaxis_title="Salary",
legend_title="Industry",
boxmode="group"
)
HTML(to_html(fig, include_plotlyjs='cdn'))
The box plot shows that salary distributions vary significantly across industries, with sectors like Unclassified Industry, Information, and Waste Management and Remediation Services offering higher and more dispersed salaries. In contrast, industries like Art, Entertatainment or Agriculture generally show lower and more compact salary ranges.
POSTED).POSTEDNumber of Job Postingsfrom pyspark.sql.functions import to_date, col, count, date_format
import pandas as pd
import plotly.express as px
df_with_date = df.withColumn("POSTED_DATE", to_date(col("POSTED"), "M/d/yyyy"))
# Group by week or month if all dates are too similar
df_grouped = df_with_date.withColumn("POSTED_MONTH", date_format("POSTED_DATE", "yyyy-MM")) \
.groupBy("POSTED_MONTH") \
.agg(count("*").alias("NUM_POSTINGS"))
# Convert to pandas for plotting
pdf = pd.DataFrame(df_grouped.sort("POSTED_MONTH").collect(), columns=["POSTED_MONTH", "NUM_POSTINGS"])
# Plot line chart
fig = px.line(
pdf,
x="POSTED_MONTH",
y="NUM_POSTINGS",
title="Job Postings Over Time (Monthly)",
markers=True
)
fig.update_layout(
font_family="Arial",
title_font_size=18,
xaxis_title="Month",
yaxis_title="Number of Job Postings",
plot_bgcolor="#f8f9fa"
)
HTML(to_html(fig, include_plotlyjs='cdn'))
The chart shows that the number of job postings decreased during the summer, while postings remained consistent at the beginning and end of the year. With more than 11,000 job postings each month, it reflects a high and steady demand for workers in the U.S.
TITLE_NAME).TITLE_NAMEJob Count# Group by job title and count occurrences
job_counts = df.groupBy("TITLE_NAME") \
.agg(count("*").alias("JOB_COUNT"))
# Select Top 10 job titles by count
top_10_jobs = job_counts.orderBy(col("JOB_COUNT").desc()).limit(10)
# Convert to Pandas for Plotly
pdf = pd.DataFrame(
top_10_jobs.collect(),
columns=["TITLE_NAME", "JOB_COUNT"]
)
# Plot bar chart
fig = px.bar(
pdf,
x="TITLE_NAME",
y="JOB_COUNT",
title="Top 10 Most Frequently Posted Job Titles",
color="TITLE_NAME",
color_discrete_sequence=px.colors.qualitative.Vivid
)
# Customize styles
fig.update_layout(
font_family="Arial",
title_font_size=18,
xaxis_title="Job Title",
yaxis_title="Number of Postings",
xaxis_tickangle=-45
)
HTML(to_html(fig, include_plotlyjs='cdn'))
The chart shows that Data Analyst is the most frequently posted job title, with more than double the postings compared to other roles—highlighting the strong demand for data professionals. In contrast, Data Quality Analyst appears the least among the top 10, suggesting it is a more specialized or less commonly advertised role.
REMOTE_TYPE_NAME).REMOTE_TYPE_NAMEJob Count# Group by REMOTE_TYPE_NAME and count job postings
remote_counts = df.groupBy("REMOTE_TYPE_NAME") \
.agg(count("*").alias("JOB_COUNT"))
# Convert to Pandas for visualization
pdf = pd.DataFrame(
remote_counts.collect(),
columns=["REMOTE_TYPE_NAME", "JOB_COUNT"]
)
# Create pie chart
fig = px.pie(
pdf,
names="REMOTE_TYPE_NAME",
values="JOB_COUNT",
title="Proportion of Remote vs On-site Job Postings",
color_discrete_sequence=px.colors.sequential.RdBu
)
# Customize styles
fig.update_layout(
font_family="Arial",
title_font_size=18,
legend_title="Remote Type"
)
HTML(to_html(fig, include_plotlyjs='cdn'))
The pie chart reveals that a majority of job postings (78%) are for on-site positions, while remote roles make up 17.2%, and hybrid opportunities account for only 1.55%. This suggests that despite the growth of remote work trends, many employers still prioritize in-person collaboration—possibly due to the nature of the industry, security requirements, or company culture.
IndustrySkill CountSkillfrom pyspark.sql.functions import split, explode, col, count
import pandas as pd
import plotly.express as px
# Extract and explode skills from a comma-separated list
skills_df = df.withColumn("SKILLS_NAME", explode(split(col("SKILLS_NAME"), ",\s*")))
# Group by industry and skill, then count how often each appears
industry_skills = skills_df.groupBy("NAICS2_NAME", "SKILLS_NAME") \
.agg(count("*").alias("SKILL_COUNT"))
# Limit to most common skills across industries (top 5 per industry)
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
windowSpec = Window.partitionBy("NAICS2_NAME").orderBy(col("SKILL_COUNT").desc())
top_skills_per_industry = industry_skills.withColumn("rank", row_number().over(windowSpec)) \
.filter(col("rank") <= 5)
# Convert to Pandas for Plotly
pdf = pd.DataFrame(
top_skills_per_industry.select("NAICS2_NAME", "SKILLS_NAME", "SKILL_COUNT").collect(),
columns=["NAICS2_NAME", "SKILLS_NAME", "SKILL_COUNT"]
)
fig = px.bar(
pdf,
x="NAICS2_NAME",
y="SKILL_COUNT",
color="SKILLS_NAME",
title="Top In-Demand Skills by Industry",
color_discrete_sequence=px.colors.qualitative.Pastel
)
fig.update_layout(
font_family="Arial",
title_font_size=18,
xaxis_title="Industry",
yaxis_title="Skill Count",
legend_title="Skill",
barmode="stack",
xaxis_tickangle=-45
)
HTML(to_html(fig, include_plotlyjs='cdn'))<>:6: SyntaxWarning:
invalid escape sequence '\s'
<>:6: SyntaxWarning:
invalid escape sequence '\s'
/tmp/ipykernel_10391/2673313181.py:6: SyntaxWarning:
invalid escape sequence '\s'
Across most industries, the most in-demand skills include data analysis, communication, problem-solving, and management. These skills are highly valued because they are essential for making data-driven decisions, collaborating effectively in teams, and navigating complex business challenges—regardless of industry.
ONET_NAMEMedian Salaryfrom pyspark.sql.functions import count, percentile_approx
# Group by ONET and LOT occupation names
onet_summary = df.groupBy("ONET_NAME", "LOT_OCCUPATION_NAME") \
.agg(
count("*").alias("JOB_COUNT"),
percentile_approx("SALARY_FROM", 0.5).alias("MEDIAN_SALARY")
)
# Convert to Pandas with all grouped columns
pdf = pd.DataFrame(
onet_summary.collect(),
columns=["ONET_NAME", "LOT_OCCUPATION_NAME", "JOB_COUNT", "MEDIAN_SALARY"]
)
# Create the bubble chart
fig = px.scatter(
pdf,
x="LOT_OCCUPATION_NAME",
y="MEDIAN_SALARY",
size="JOB_COUNT",
color="MEDIAN_SALARY",
title="Median Salary by ONET Occupation",
color_continuous_scale="Blues",
hover_name="ONET_NAME" # Optional: show ONET_NAME on hover
)
# Styling
fig.update_layout(
font_family="Arial",
title_font_size=18,
xaxis_title="Occupation (LOT)",
yaxis_title="Median Salary",
xaxis_tickangle=-45
)
HTML(to_html(fig, include_plotlyjs='cdn'))
In the chart, Computer Systems Engineers have a higher median salary compared to Business Intelligence Analysts, reflecting stronger compensation in technical engineering roles. Meanwhile, Data Mining Specialists show the lowest median salary among the ONET occupations, suggesting either lower demand or entry-level positioning in the market.
SOC_2021_2_NAMESOC_2021_3_NAMEfrom pyspark.sql.functions import count
import pandas as pd
import plotly.graph_objects as go
# Step 1: Count transitions (assuming each record is a job)
transitions_df = df.groupBy("SOC_2021_2_NAME", "SOC_2021_3_NAME") \
.agg(count("*").alias("TRANSITIONS"))
# Step 2: Convert to Pandas
pdf = pd.DataFrame(
transitions_df.collect(),
columns=["SOC_2021_2_NAME", "SOC_2021_3_NAME", "TRANSITIONS"]
)
# Step 3: Create unique list of node names (source + target)
labels = list(pd.unique(pdf[["SOC_2021_2_NAME", "SOC_2021_3_NAME"]].values.ravel()))
# Map each SOC name to an index for Sankey diagram
label_index = {label: i for i, label in enumerate(labels)}
# Map source/target names to indexes
pdf["source_idx"] = pdf["SOC_2021_2_NAME"].map(label_index)
pdf["target_idx"] = pdf["SOC_2021_3_NAME"].map(label_index)
fig = go.Figure(data=[go.Sankey(
node=dict(
pad=20,
thickness=20,
line=dict(color="black", width=0.5),
label=labels,
color="lightblue"
),
link=dict(
source=pdf["source_idx"],
target=pdf["target_idx"],
value=pdf["TRANSITIONS"]
)
)])
fig.update_layout(
title_text="Job Transitions Between SOC Occupation Levels",
font=dict(size=12, family="Arial")
)
HTML(to_html(fig, include_plotlyjs='cdn'))
The Sankey diagram shows a single transition from Computer and Mathematical Occupations to Mathematical Science Occupations, indicating that this is the only job flow captured in the current dataset. This may suggest that the data is focused on a specific occupational pathway—such as transitions within data or analytics-related roles—or that other occupation transitions were not recorded or available.